# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from multiprocessing.connection import Client

import numpy as np

import torch
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
    skip_annotation,
)
from executorch.examples.qualcomm.utils import (
    build_executorch_binary,
    make_output_dir,
    make_quantizer,
    parse_skip_delegation_node,
    setup_common_args_and_variables,
    SimpleADB,
)
from executorch.exir import to_edge
from transformers import BertTokenizer, MobileBertForSequenceClassification


def evaluate(model, data_val):
    """Run the torch model over the validation set and collect predictions.

    Args:
        model: callable accepting ``input_ids`` / ``attention_mask`` /
            ``labels`` keyword tensors and returning a tuple whose second
            element is the logits tensor (``return_dict=False`` convention).
        data_val: iterable of batches, each indexable as
            (input_ids, attention_mask, labels).

    Returns:
        Tuple of two numpy arrays: logits stacked along axis 0, and the
        matching golden label ids.
    """
    all_logits, all_golds = [], []
    for batch in data_val:
        ids, mask, gold = (t.to(torch.long) for t in batch[:3])
        outputs = model(input_ids=ids, attention_mask=mask, labels=gold)
        # index 1 is the logits tensor when labels are supplied
        all_logits.append(outputs[1].detach().numpy())
        all_golds.append(gold.numpy())

    return (
        np.concatenate(all_logits, axis=0),
        np.concatenate(all_golds, axis=0),
    )


def accuracy_per_class(preds, goldens, labels):
    """Count correct predictions per class.

    Args:
        preds: (N, num_classes) array of logits/scores.
        goldens: (N,) array of golden class indices.
        labels: mapping of class name -> class index.

    Returns:
        Dict mapping class name to ``[num_correct, num_total]``.
    """
    name_of = {index: name for name, index in labels.items()}
    predicted = np.argmax(preds, axis=1).flatten()
    expected = goldens.flatten()

    result = {}
    for cls in np.unique(expected):
        mask = expected == cls
        correct = int((predicted[mask] == cls).sum())
        result[name_of[cls]] = [correct, int(mask.sum())]

    return result


def get_dataset(data_val):
    """Convert validation batches into 4-tuples of int32 input tensors.

    Each returned entry is (input_ids, attention_mask, token_type_ids,
    position_ids), the positional input layout expected by the exported
    MobileBERT graph.
    """
    # max_position_embeddings defaults to 512
    full_position_ids = torch.arange(512).expand((1, -1)).to(torch.int32)

    samples = []
    for batch in data_val:
        input_ids, attention_mask = (t.to(torch.int32) for t in batch[:2])
        seq_len = input_ids.shape[1]
        # single-segment input: token_type_ids are all zeros
        token_type_ids = torch.zeros(input_ids.size(), dtype=torch.int32)
        samples.append(
            (
                input_ids,
                attention_mask,
                token_type_ids,
                full_position_ids[:, :seq_len],
            )
        )

    return samples


def get_fine_tuned_mobilebert(artifacts_dir: str, pretrained_weight, batch_size: int):
    """Download a text-classification dataset and return a fine-tuned MobileBERT.

    Fetches the title_conference.csv dataset, splits it into train/val,
    tokenizes both splits, and — unless ``pretrained_weight`` points at an
    existing checkpoint — fine-tunes ``google/mobilebert-uncased`` and saves
    one checkpoint per epoch under ``artifacts_dir``.

    Args:
        artifacts_dir: directory where per-epoch checkpoints are written/read.
        pretrained_weight: path to an existing checkpoint; when falsy/None the
            model is trained here and the last epoch's checkpoint is loaded.
        batch_size: batch size for both the train and validation DataLoaders.

    Returns:
        Tuple of (model in eval mode, validation DataLoader, label-name ->
        class-index dict).
    """
    from io import BytesIO

    import pandas as pd
    import requests
    from sklearn.model_selection import train_test_split
    from torch.utils.data import (
        DataLoader,
        RandomSampler,
        SequentialSampler,
        TensorDataset,
    )
    from tqdm import tqdm
    from transformers import get_linear_schedule_with_warmup

    # grab dataset
    url = "https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv"
    content = requests.get(url, allow_redirects=True).content
    data = pd.read_csv(BytesIO(content))

    # get training / validation data
    labels = {key: index for index, key in enumerate(data.Conference.unique())}
    data["label"] = data.Conference.replace(labels)

    # stratified 85/15 split on row indices; fixed seed for reproducibility
    train, val, _, _ = train_test_split(
        data.index.values,
        data.label.values,
        test_size=0.15,
        random_state=42,
        stratify=data.label.values,
    )

    data["data_type"] = ["not_set"] * data.shape[0]
    data.loc[train, "data_type"] = "train"
    data.loc[val, "data_type"] = "val"
    data.groupby(["Conference", "label", "data_type"]).count()

    # get pre-trained mobilebert
    tokenizer = BertTokenizer.from_pretrained(
        "bert-base-uncased",
        do_lower_case=True,
    )
    # return_dict=False so forward() yields (loss, logits, ...) tuples,
    # which evaluate() and the training loop below index positionally
    model = MobileBertForSequenceClassification.from_pretrained(
        "google/mobilebert-uncased",
        num_labels=len(labels),
        return_dict=False,
    )

    # tokenize dataset (pad/truncate every title to a fixed length of 256)
    encoded_data_train = tokenizer.batch_encode_plus(
        data[data.data_type == "train"].Title.values,
        add_special_tokens=True,
        return_attention_mask=True,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded_data_val = tokenizer.batch_encode_plus(
        data[data.data_type == "val"].Title.values,
        add_special_tokens=True,
        return_attention_mask=True,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    input_ids_train = encoded_data_train["input_ids"]
    attention_masks_train = encoded_data_train["attention_mask"]
    labels_train = torch.tensor(data[data.data_type == "train"].label.values)

    input_ids_val = encoded_data_val["input_ids"]
    attention_masks_val = encoded_data_val["attention_mask"]
    labels_val = torch.tensor(data[data.data_type == "val"].label.values)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    # NOTE(review): reads the module-level `args` set in __main__ instead of
    # taking num_epochs as a parameter — this raises NameError if the module
    # is imported rather than run as a script; consider passing it explicitly.
    epochs = args.num_epochs
    dataloader_train = DataLoader(
        dataset_train,
        sampler=RandomSampler(dataset_train),
        batch_size=batch_size,
    )
    # drop_last keeps every on-device validation batch exactly batch_size wide
    dataloader_val = DataLoader(
        dataset_val,
        sampler=SequentialSampler(dataset_val),
        batch_size=batch_size,
        drop_last=True,
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs
    )

    # start training (only when no pretrained checkpoint was supplied)
    if not pretrained_weight:
        for epoch in range(1, epochs + 1):
            loss_train_total = 0
            print(f"epoch {epoch}")

            for batch in tqdm(dataloader_train):
                model.zero_grad()
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[2],
                }
                loss = model(**inputs)[0]
                loss_train_total += loss.item()
                loss.backward()
                # gradient clipping for training stability
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

            # checkpoint after every epoch so partial runs are reusable
            torch.save(
                model.state_dict(),
                f"{artifacts_dir}/finetuned_mobilebert_epoch_{epoch}.model",
            )

    # load either the user-supplied checkpoint or the last one trained above
    model.load_state_dict(
        torch.load(
            (
                f"{artifacts_dir}/finetuned_mobilebert_epoch_{epochs}.model"
                if pretrained_weight is None
                else pretrained_weight
            ),
            map_location=torch.device("cpu"),
            weights_only=True,
        ),
    )

    return model.eval(), dataloader_val, labels


def main(args):
    """Compile fine-tuned MobileBERT for QNN, run it on-device, and compare
    the on-device (HTP) per-class accuracy against the torch CPU baseline.

    Args:
        args: parsed command-line namespace (artifact dir, SoC model, PTQ
            setting, device/host ids, fp16 flag, optional result socket).

    Raises:
        AssertionError: when ``args.ptq`` names an unsupported quant type.
    """
    skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args)

    # ensure the working directory exist.
    os.makedirs(args.artifact, exist_ok=True)

    batch_size, pte_filename = 1, "ptq_mb_qnn"
    model, data_val, labels = get_fine_tuned_mobilebert(
        args.artifact, args.pretrained_weight, batch_size
    )
    inputs = get_dataset(data_val)

    try:
        quant_dtype = getattr(QuantDtype, f"use_{args.ptq}")
    except AttributeError as e:
        # only an unknown --ptq value should map to this error; the former
        # bare `except:` also swallowed KeyboardInterrupt / SystemExit
        raise AssertionError(
            f"No support for quant type {args.ptq}. Support 8a8w, 16a16w and 16a4w."
        ) from e

    if args.use_fp16:
        # fp16 path: no quantization, single-shot lowering
        quant_dtype = None
        pte_filename = "mb_qnn"
        build_executorch_binary(
            model,
            inputs[0],
            args.model,
            f"{args.artifact}/{pte_filename}",
            inputs,
            skip_node_id_set=skip_node_id_set,
            skip_node_op_set=skip_node_op_set,
            quant_dtype=quant_dtype,
            shared_buffer=args.shared_buffer,
        )
    else:

        def calibrator(gm):
            # feed every validation sample through the graph for calibration
            for input in inputs:
                gm(*input)

        quantizer = make_quantizer(quant_dtype=quant_dtype)
        backend_options = generate_htp_compiler_spec(quant_dtype is not None)
        compiler_specs = generate_qnn_executorch_compiler_spec(
            soc_model=getattr(QcomChipset, args.model),
            backend_options=backend_options,
        )
        # skip embedding layer cause it's quantization sensitive
        graph_module, _ = skip_annotation(
            nn_module=model,
            quantizer=quantizer,
            compiler_specs=compiler_specs,
            sample_input=inputs[0],
            calibration_cb=calibrator,
            fp_node_op_set={torch.ops.aten.embedding.default},
        )
        # lower all graph again, the skipped operators will be left in CPU
        exec_prog = to_edge(
            torch.export.export(graph_module, inputs[0], strict=True),
        ).to_executorch()

        with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file:
            file.write(exec_prog.buffer)

    if args.compile_only:
        return

    # push the compiled program to the device and execute it over all inputs
    adb = SimpleADB(
        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
        build_path=f"{args.build_folder}",
        pte_path=f"{args.artifact}/{pte_filename}.pte",
        workspace=f"/data/local/tmp/executorch/{pte_filename}",
        device_id=args.device,
        host_id=args.host,
        soc_model=args.model,
        shared_buffer=args.shared_buffer,
        target=args.target,
    )
    adb.push(inputs=inputs)
    adb.execute()

    # collect output data
    output_data_folder = f"{args.artifact}/outputs"
    make_output_dir(output_data_folder)

    adb.pull(output_path=args.artifact)

    # get torch cpu result
    cpu_preds, true_vals = evaluate(model, data_val)
    cpu_result = accuracy_per_class(cpu_preds, true_vals, labels)

    # get QNN HTP result (one raw float32 logits file per input sample)
    htp_preds = []
    for i in range(len(data_val)):
        result = np.fromfile(
            os.path.join(output_data_folder, f"output_{i}_0.raw"),
            dtype=np.float32,
        )
        htp_preds.append(result.reshape(batch_size, -1))

    htp_result = accuracy_per_class(
        np.concatenate(htp_preds, axis=0), true_vals, labels
    )

    # either send results to a listening harness or print them locally
    if args.ip and args.port != -1:
        with Client((args.ip, args.port)) as conn:
            conn.send(json.dumps({"CPU": cpu_result, "HTP": htp_result}))
    else:
        for target in zip(["CPU", "HTP"], [cpu_result, htp_result]):
            print(f"\n[{target[0]}]")
            for k, v in target[1].items():
                print(f"{k}: {v[0]}/{v[1]}")

if __name__ == "__main__":
    parser = setup_common_args_and_variables()

    parser.add_argument(
        "-a",
        "--artifact",
        help="path for storing generated artifacts by this example. Default ./mobilebert_fine_tune",
        default="./mobilebert_fine_tune",
        type=str,
    )

    parser.add_argument(
        "-p",
        "--pretrained_weight",
        help="Location of pretrained weight",
        default=None,
        type=str,
    )

    parser.add_argument(
        "--num_epochs",
        help="If no pretrained weights are provided, set number of epochs to train the model",
        default=5,
        type=int,
    )

    parser.add_argument(
        "-F",
        "--use_fp16",
        help="If specified, will run in fp16 precision and discard ptq setting",
        action="store_true",
        default=False,
    )

    parser.add_argument(
        "-P",
        "--ptq",
        help="If specified, will do PTQ quantization. default is 8bits activation and 8bits weight. Support 8a8w, 16a16w and 16a4w.",
        default="8a8w",
    )

    args = parser.parse_args()
    args.validate(args)
    try:
        main(args)
    except Exception as e:
        if args.ip and args.port != -1:
            with Client((args.ip, args.port)) as conn:
                conn.send(json.dumps({"Error": str(e)}))
        else:
            raise Exception(e)
